Link: Funding Successful Projects
Author: Sethu Iyer
In [1]:
import pandas as pd
import numpy as np
import datetime
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
import xgboost as xgb
In [2]:
pd.set_option('display.max_colwidth',100)
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
Creating Features from datetime
In [12]:
#First, convert the Unix timestamps to datetime objects
unix_cols = ['deadline','state_changed_at','launched_at','created_at']
for col in unix_cols:
    train[col] = train[col].apply(lambda timestamp: datetime.datetime.fromtimestamp(int(timestamp)))
    test[col] = test[col].apply(lambda timestamp: datetime.datetime.fromtimestamp(int(timestamp)))
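A quick optional check (not part of the original run) confirms the four columns now hold proper datetimes:
In [ ]:
# Optional sanity check: the Unix-integer columns should now be datetime objects
print(train[unix_cols].dtypes)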
In [15]:
#Log of the time differences in seconds: 1) launched_at - created_at  2) deadline - launched_at
train['launch_create'] = train.apply(lambda row: np.log((row['launched_at'] - row['created_at']).total_seconds()),axis=1)
test['launch_create'] = test.apply(lambda row: np.log((row['launched_at'] - row['created_at']).total_seconds()),axis=1)
In [16]:
train['deadline_launch'] = train.apply(lambda row: np.log((row['deadline'] - row['launched_at']).total_seconds()),axis=1)
test['deadline_launch'] = test.apply(lambda row: np.log((row['deadline'] - row['launched_at']).total_seconds()),axis=1)
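One caveat: np.log yields -inf or NaN whenever the two timestamps coincide or are out of order. A defensive variant, assuming such rows can occur in the data (the column name launch_create_safe is made up here), clips the gap and uses log1p:
In [ ]:
# Sketch: guard against zero/negative durations before the log transform
train['launch_create_safe'] = train.apply(
    lambda row: np.log1p(max((row['launched_at'] - row['created_at']).total_seconds(), 0)),
    axis=1)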
Now, normalizing the currency
In [17]:
total_currency = pd.concat([train['currency'], test['currency']])  # Series.append was removed in pandas 2.0
print(pd.unique(total_currency))
In [18]:
conversion_factor = {'USD': 1.00,
                     'GBP': 1.28,
                     'CAD': 0.75,
                     'AUD': 0.76,
                     'NZD': 0.73,
                     'EUR': 1.12,
                     'SEK': 0.11,
                     'NOK': 0.12,
                     'DKK': 0.15,
                     'CHF': 1.03,
                     'HKD': 0.13,
                     'SGD': 0.72,
                     'MXN': 0.056}
train['goal'] = train.apply(lambda row: row['goal'] * conversion_factor[row['currency']], axis=1)
test['goal'] = test.apply(lambda row: row['goal'] * conversion_factor[row['currency']], axis=1)
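The conversion factors above are a fixed snapshot of exchange rates, so the goals are only approximately in USD. A quick grouped summary (optional) shows the converted goals on a common scale:
In [ ]:
# Median converted goal per original currency, now all in (approximate) USD
print(train.groupby('currency')['goal'].median().sort_values(ascending=False))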
Now, creating some text features
In [19]:
train['name_count'] = train['name'].str.split().str.len()
train['desc_count'] = train['desc'].str.split().str.len()
test['name_count'] = test['name'].str.split().str.len()
test['desc_count'] = test['desc'].str.split().str.len()
train['keywords_len'] = train['keywords'].str.len()
train['keywords_count'] = train['keywords'].str.split('-').str.len()
test['keywords_len'] = test['keywords'].str.len()
test['keywords_count'] = test['keywords'].str.split('-').str.len()
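Missing names or descriptions leave NaN in these counts. XGBoost handles NaN natively, but if you prefer fully defined features, an optional fill (an assumption, not part of the original pipeline) works:
In [ ]:
# Optional: replace NaN counts from missing text with 0
for df in (train, test):
    df[['name_count','desc_count']] = df[['name_count','desc_count']].fillna(0)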
Creating more complex text features
In [24]:
import re

def desc_clean(word):
    # Collapse non-word characters, digits, and whitespace runs into single spaces
    p1 = re.sub(pattern=r'(\W+)|(\d+)|(\s+)', repl=' ', string=word)
    p1 = p1.lower()
    return p1

kickdesc = pd.Series(train['desc'].tolist() + test['desc'].tolist()).astype(str)
kickdesc = kickdesc.map(desc_clean)
In [25]:
stop = set(stopwords.words('english'))
kickdesc = [[word for word in sent.split() if word not in stop] for sent in kickdesc]
stemmer = SnowballStemmer(language='english')
kickdesc = [[stemmer.stem(word) for word in sent] for sent in kickdesc]
kickdesc = [[word for word in sent if len(word) > 2] for sent in kickdesc]
kickdesc = [' '.join(sent) for sent in kickdesc]
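To make the pipeline concrete, here is an illustrative run on a made-up description (the sample string is hypothetical):
In [ ]:
sample = "A hand-crafted 2017 documentary about 3 women changing the world!"
cleaned = desc_clean(sample)                            # strip digits/punctuation, lowercase
tokens = [w for w in cleaned.split() if w not in stop]  # drop stopwords
stems = [stemmer.stem(w) for w in tokens]               # Snowball stemming
print(' '.join(w for w in stems if len(w) > 2))         # -> 'hand craft documentari women chang world'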
In [26]:
cv = CountVectorizer(max_features=300)
combine = pd.DataFrame(cv.fit_transform(kickdesc).toarray())
combine.rename(columns=lambda x: 'variable_' + str(x), inplace=True)
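It can help to eyeball which 300 stems survived; note that get_feature_names_out requires scikit-learn >= 1.0 (older releases expose get_feature_names instead):
In [ ]:
# Peek at the start of the vectorizer's vocabulary
print(cv.get_feature_names_out()[:20])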
In [27]:
train_text = combine[:train.shape[0]]
test_text = combine[train.shape[0]:]
test_text.reset_index(drop=True,inplace=True)
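A cheap assertion (optional) guards the row split against off-by-one mistakes:
In [ ]:
# The split must mirror the train/test order used to build kickdesc
assert train_text.shape[0] == train.shape[0]
assert test_text.shape[0] == test.shape[0]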
Creating some more text features
In [30]:
len_feats = ['name_len','desc_len']
cols_to_use = ['name','desc']
for i in np.arange(2):
    train[len_feats[i]] = train[cols_to_use[i]].apply(str).apply(len)
    test[len_feats[i]] = test[cols_to_use[i]].apply(str).apply(len)
Finalizing the training and testing data
In [31]:
cols_to_use = ['name_len','desc_len','keywords_len','name_count','desc_count','keywords_count','goal','launch_create','deadline_launch']
target = train['final_status']
train = train[cols_to_use]
test = test[cols_to_use]
In [32]:
X_train = pd.concat([train, train_text],axis=1)
X_test = pd.concat([test, test_text],axis=1)
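pd.concat aligns on the index, so this step silently assumes train, test, train_text, and test_text all carry a clean 0..n-1 RangeIndex. A defensive variant (a sketch, equivalent when that assumption already holds) makes it explicit:
In [ ]:
# Optional: reset indices so the column-wise concat cannot misalign rows
X_train = pd.concat([train.reset_index(drop=True), train_text], axis=1)
X_test = pd.concat([test.reset_index(drop=True), test_text], axis=1)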
It's training time!
In [33]:
dtrain = xgb.DMatrix(data=X_train, label=target)
dtest = xgb.DMatrix(data=X_test)
In [34]:
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'error',
    'eta': 0.025,
    'max_depth': 6,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'min_child_weight': 5
}
In [36]:
bst = xgb.cv(params, dtrain, num_boost_round=1000, early_stopping_rounds=40, nfold=5, verbose_eval=10)
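xgb.cv returns a DataFrame with one row per boosting round, truncated at the best iteration by early stopping, so the round count used below can be read off rather than hard-coded (a sketch, assuming the default return format):
In [ ]:
# Best round count and the corresponding CV error
best_rounds = bst.shape[0]
print(best_rounds, bst['test-error-mean'].iloc[-1])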
In [39]:
bst_train = xgb.train(params, dtrain, num_boost_round=900)
p_test = bst_train.predict(dtest)
In [38]:
sub = pd.DataFrame()
test = pd.read_csv('test.csv')
sub['project_id'] = test['project_id']
sub['final_status'] = p_test
sub['final_status'] = [1 if x > 0.5 else 0 for x in sub['final_status']]
sub.to_csv("xgb_with_python_feats.csv",index=False) #70.60
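The 0.5 cutoff matches the implicit threshold of the 'error' eval metric used during training; a vectorized equivalent of the list comprehension above:
In [ ]:
# Vectorized form of the same 0.5 thresholding
sub['final_status'] = (p_test > 0.5).astype(int)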